Read data
Get list of samples
Code
samples <- read_tsv ("config/samples.tsv" ,
col_types = list (
sample_name = col_character (),
cell_line = col_factor (),
exogenous_rna = col_factor (),
day = col_factor ()
)
)
units <- read_tsv ("config/units.tsv" ,
col_types = list (
sample_name = col_character (),
unit_name = col_character (),
fq1 = col_character (),
fq2 = col_character ()
)
)
sample_units <- dplyr:: left_join (samples, units, by = "sample_name" ) %>%
unite (sample_unit, sample_name, unit_name, remove = FALSE ) %>%
mutate (
day = as.factor (paste0 ("day" , day)),
batch = as.factor (case_when (
exogenous_rna == "mastermix1" ~ "batch3" ,
exogenous_rna == "mastermix2" ~ "batch3" ,
TRUE ~ "batch2"
)),
.before = cell_line,
)
sample_units
Table of exogenous RNA mixtures
Code
# Exogenous RNA mixtures
rna_mixes <- tibble ()
for (mix in c ("mastermix1" , "mastermix2" , "PJY103_mDNMT1" , "PJY300_mDNMT1" )) {
t <- Biostrings:: readDNAStringSet (sprintf ("data/references/%s.fa" , mix))
rna_mixes <- rbind (rna_mixes, tibble (
exogenous_rna = mix,
rna_species = word (t@ ranges@ NAMES, 1 ),
start = 1 ,
end = t@ ranges@ width
))
}
rna_mixes
Biotype comparison
count only fragments that were properly aligned
annotate with GENCODE gene model
primary alignments were counted, even if the fragments aligned multiple times
fragments aligning to multiple features were assigned to the feature that mostly closely overlapped with the fragment
Code
human_counts_dir <- "results/smrna_count/"
biotype_counts_files <- paste0 (
human_counts_dir,
sample_units$ sample_unit,
"_first_proper_pair_biotype_count.txt"
)
exogenous_counts_dir <- "results/exogenous_rna_count/"
exogenous_counts_files <- paste0 (
exogenous_counts_dir,
sample_units$ sample_unit,
"_idxstats.txt"
)
biotype_counts <- readr:: read_tsv (
biotype_counts_files[1 ],
comment = "#" ,
col_names = c ("biotype" , biotype_counts_files[1 ]),
col_types = "ci"
)
exogenous_counts <- read_tsv (
exogenous_counts_files[1 ],
col_names = c ("exogenous_rna" , exogenous_counts_files[1 ]),
col_types = "c-i-"
)
biotype_counts <- biotype_counts %>%
dplyr:: add_row (
biotype = "exogenous_rna" ,
"{biotype_counts_files[1]}" : = sum (exogenous_counts[[exogenous_counts_files[1 ]]])
)
for (i in 2 : length (biotype_counts_files)) {
biotype_sample <-
readr:: read_tsv (
biotype_counts_files[i],
comment = "#" ,
col_names = c ("biotype" , biotype_counts_files[i]),
col_types = "ci"
)
exogenous_counts_sample <-
readr:: read_tsv (
exogenous_counts_files[i],
col_names = c ("exogenous_rna" , exogenous_counts_files[i]),
col_types = "c-i-"
)
biotype_sample <- biotype_sample %>%
dplyr:: add_row (
biotype = "exogenous_rna" ,
"{biotype_counts_files[i]}" : =
sum (exogenous_counts_sample[[exogenous_counts_files[i]]])
)
biotype_counts <- biotype_counts %>%
dplyr:: full_join (biotype_sample, by = "biotype" )
}
biotype_counts <- biotype_counts %>%
rename_all (~ stringr:: str_replace_all (
., human_counts_dir, ""
)) %>%
rename_all (~ stringr:: str_replace_all (., "_first_proper_pair_biotype_count.txt" , "" )) %>%
tidyr:: pivot_longer (! biotype, names_to = "sample" , values_to = "count" )
biotype_counts
Code
p <- ggplot (
data = subset (biotype_counts, ! is.na (count)),
aes (x = sample, y = count, fill = biotype)
) +
geom_bar (stat = "identity" , position = "fill" ) +
theme (axis.text.x = element_text (angle = 90 , vjust = 0.5 , hjust = 1 ))
p
small RNA gene counts
count only fragments that were properly aligned
annotate with GENCODE gene model
primary alignments were counted, even if the fragments aligned multiple times
fragments aligning to multiple features were assigned to the feature that mostly closely overlapped with the fragment
exogenous RNA counts are total fragments that aligned
Code
human_counts_dir <- "results/smrna_count/"
gene_counts_files <- paste0 (
human_counts_dir,
sample_units$ sample_unit,
"_first_proper_pair_gene_count.txt"
)
exogenous_counts_dir <- "results/exogenous_rna_count/"
exogenous_counts_files <- paste0 (
exogenous_counts_dir,
sample_units$ sample_unit,
"_idxstats.txt"
)
gene_counts <- readr:: read_tsv (
gene_counts_files[1 ],
comment = "#" ,
col_names = c ("gene" , gene_counts_files[1 ]),
col_types = "ci"
)
exogenous_counts <- read_tsv (
exogenous_counts_files[1 ],
col_names = c ("gene" , gene_counts_files[1 ]),
col_types = "c-i-"
)
gene_counts <- rbind (gene_counts, exogenous_counts)
for (i in 2 : length (gene_counts_files)) {
gene_sample <-
readr:: read_tsv (
gene_counts_files[i],
comment = "#" ,
col_names = c ("gene" , gene_counts_files[i]),
col_types = "ci"
)
exogenous_counts_sample <- read_tsv (
exogenous_counts_files[i],
col_names = c ("gene" , gene_counts_files[i]),
col_types = "c-i-"
)
gene_sample <- rbind (gene_sample, exogenous_counts_sample)
gene_counts <- gene_counts %>%
dplyr:: full_join (gene_sample, by = "gene" )
}
gene_counts <- gene_counts %>%
rename_all (~ stringr:: str_replace_all (
., human_counts_dir, ""
)) %>%
rename_all (~ stringr:: str_replace_all (., "_first_proper_pair_gene_count.txt" , "" ))
gene_counts
Code
# List of exogenous genes to highlight
exogenous_rna_names <- gene_counts %>%
dplyr:: filter (str_detect (gene, "^PJY" )) %>%
pull (gene)
Sample comparisons
Heatmap of sample-to-sample distances
Code
sample_dists <- dist (t (assay (vsd)))
sample_dist_matrix <- as.matrix (sample_dists)
rownames (sample_dist_matrix) <- paste (vsd$ cell_line,
vsd$ exogenous_rna,
vsd$ day,
sep = "-"
)
colnames (sample_dist_matrix) <- NULL
Code
colors <- colorRampPalette (rev (brewer.pal (9 , "Blues" )))(255 )
pheatmap (sample_dist_matrix,
clustering_distance_rows = sample_dists,
clustering_distance_cols = sample_dists,
col = colors
)
PCA plot of samples
Code
plotPCA (vsd, intgroup = c ("cell_line" , "exogenous_rna" , "day" ))
Differetial Expression
Batch 2 - Day 1
Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental
Code
dds_batch2_day1 <- subset (dds, select = colData (dds)$ batch == "batch2" &
colData (dds)$ day == "day1" )
dds_batch2_day1$ exogenous_rna <- droplevels (dds_batch2_day1$ exogenous_rna)
dds_batch2_day1$ day <- droplevels (dds_batch2_day1$ day)
design (dds_batch2_day1) <- ~ exogenous_rna + cell_line
dds_batch2_day1 <- DESeq (dds_batch2_day1, parallel = TRUE )
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
class: DESeqDataSet
dim: 56701 16
metadata(1): version
assays(4): counts mu H cooks
rownames(56701): GAS5 SNORD30 ... MIR6134 ENSG00000270413
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(16): P1E10_sorted_PJY103_pegRNA_day1_rep1
P1E10_sorted_PJY103_pegRNA_day1_rep2 ...
Parental_PJY300_epegRNA_day1_rep3 Parental_PJY300_epegRNA_day1_rep4
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch2_day1 <- results (dds_batch2_day1, alpha = 0.05 )
res_batch2_day1
log2 fold change (MLE): cell line P1E10 vs Parental
Wald test p-value: cell line P1E10 vs Parental
DataFrame with 56701 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue
<numeric> <numeric> <numeric> <numeric> <numeric>
GAS5 3358381 0.1558718 0.0335180 4.650395 3.31299e-06
SNORD30 1605167 0.0910242 0.0576353 1.579315 1.14264e-01
SNORD49A 1156164 0.1842127 0.0436209 4.223034 2.41035e-05
SNORD26 857547 0.0954226 0.0539739 1.767939 7.70710e-02
SNHG1 780897 -0.0217632 0.0400106 -0.543936 5.86486e-01
... ... ... ... ... ...
ENSG00000275017 0 NA NA NA NA
ENSG00000214669 0 NA NA NA NA
ENSG00000276467 0 NA NA NA NA
MIR6134 0 NA NA NA NA
ENSG00000270413 0 NA NA NA NA
padj
<numeric>
GAS5 7.59058e-05
SNORD30 4.51373e-01
SNORD49A 4.75941e-04
SNORD26 3.61350e-01
SNHG1 8.79543e-01
... ...
ENSG00000275017 NA
ENSG00000214669 NA
ENSG00000276467 NA
MIR6134 NA
ENSG00000270413 NA
Code
out of 51617 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 936, 1.8%
LFC < 0 (down) : 1747, 3.4%
outliers [1] : 1, 0.0019%
low counts [2] : 23893, 46%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Log fold change shrinkage
Run log fold change shrinkage to account for low counts.
Code
res_batch2_day1_lfc <- lfcShrink (dds_batch2_day1,
coef = "cell_line_P1E10_vs_Parental" ,
type = "apeglm" , parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
MA Plot (lfc shrunk)
MA Plot of shrunken log2 fold changes
Code
res_batch2_day1_lfc_df <- res_batch2_day1_lfc %>%
as.data.frame () %>%
rownames_to_column ("gene" ) %>%
dplyr:: mutate (significant = padj < 0.05 )
write_tsv (res_batch2_day1_lfc_df, file = "diffexp_results_batch2_day1.tsv" )
res_batch2_day1_lfc_labelled <- res_batch2_day1_lfc_df %>%
dplyr:: filter (gene %in% exogenous_rna_names) %>%
na.omit ()
ggplot (
res_batch2_day1_lfc_df,
aes (x = log2 (baseMean), y = log2FoldChange, colour = significant)
) +
geom_point () +
ggrepel:: geom_label_repel (
data = res_batch2_day1_lfc_labelled,
aes (label = gene, segment.colour = "black" ),
min.segment.length = 0
) +
theme_bw ()
Warning: Removed 5084 rows containing missing values (geom_point).
Counts of exogenous rna
Plot of counts for a single gene (with lowest adjusted p-value)
Code
plot_list <- list ()
for (gene in res_batch2_day1_lfc_labelled$ gene) {
gene_prefix <- str_split (gene, "_" )[[1 ]][1 ]
d <- plotCounts (dds_batch2_day1,
gene = gene,
intgroup = "cell_line" ,
returnData = TRUE
)
p <- ggplot (
d %>%
rownames_to_column ("sample_unit" ) %>%
dplyr:: filter (grepl (gene_prefix, sample_unit)),
aes (x = cell_line, y = count)
) +
geom_point (position = position_jitter (width = 0.1 , height = 0 )) +
ggtitle (gene)
plot_list[[gene]] <- p
}
cowplot:: plot_grid (plotlist = plot_list)
Batch 2 - Day 2
Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental
Code
dds_batch2_day2 <- subset (dds, select = colData (dds)$ batch == "batch2" &
colData (dds)$ day == "day2" )
dds_batch2_day2$ exogenous_rna <- droplevels (dds_batch2_day2$ exogenous_rna)
dds_batch2_day2$ day <- droplevels (dds_batch2_day2$ day)
design (dds_batch2_day2) <- ~ exogenous_rna + cell_line
dds_batch2_day2 <- DESeq (dds_batch2_day2, parallel = TRUE )
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
class: DESeqDataSet
dim: 56701 16
metadata(1): version
assays(4): counts mu H cooks
rownames(56701): GAS5 SNORD30 ... MIR6134 ENSG00000270413
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(16): P1E10_sorted_PJY103_pegRNA_day2_rep1
P1E10_sorted_PJY103_pegRNA_day2_rep2 ...
Parental_PJY300_epegRNA_day2_rep3 Parental_PJY300_epegRNA_day2_rep4
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch2_day2 <- results (dds_batch2_day2, alpha = 0.05 )
res_batch2_day2
log2 fold change (MLE): cell line P1E10 vs Parental
Wald test p-value: cell line P1E10 vs Parental
DataFrame with 56701 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue
<numeric> <numeric> <numeric> <numeric> <numeric>
GAS5 3192415 0.18567762 0.0302504 6.13802 8.35556e-10
SNORD30 1514330 0.15582664 0.0510950 3.04974 2.29038e-03
SNORD49A 1170290 0.19302857 0.0427120 4.51931 6.20425e-06
SNORD26 825436 0.09512923 0.0397356 2.39406 1.66632e-02
SNHG1 678111 0.00437699 0.0310755 0.14085 8.87988e-01
... ... ... ... ... ...
ENSG00000275017 0.0500828 -0.164237 3.06038 -0.0536654 0.957202
ENSG00000214669 0.0500828 -0.164237 3.06038 -0.0536654 0.957202
ENSG00000276467 0.0500828 -0.164237 3.06038 -0.0536654 0.957202
MIR6134 0.0500828 -0.164237 3.06038 -0.0536654 0.957202
ENSG00000270413 0.0500828 -0.164237 3.06038 -0.0536654 0.957202
padj
<numeric>
GAS5 3.08070e-08
SNORD30 2.74383e-02
SNORD49A 1.36212e-04
SNORD26 1.33265e-01
SNHG1 9.77319e-01
... ...
ENSG00000275017 NA
ENSG00000214669 NA
ENSG00000276467 NA
MIR6134 NA
ENSG00000270413 NA
Code
out of 50450 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 876, 1.7%
LFC < 0 (down) : 1590, 3.2%
outliers [1] : 53, 0.11%
low counts [2] : 24293, 48%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Log fold change shrinkage
Run log fold change shrinkage to account for low counts.
Code
res_batch2_day2_lfc <- lfcShrink (dds_batch2_day2,
coef = "cell_line_P1E10_vs_Parental" ,
type = "apeglm" , parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
MA Plot (lfc shrunk)
MA Plot of shrunken log2 fold changes
Code
res_batch2_day2_lfc_df <- res_batch2_day2_lfc %>%
as.data.frame () %>%
rownames_to_column ("gene" ) %>%
dplyr:: mutate (significant = padj < 0.05 )
write_tsv (res_batch2_day2_lfc_df, file = "diffexp_results_batch2_day2.tsv" )
res_batch2_day2_lfc_labelled <- res_batch2_day2_lfc_df %>%
dplyr:: filter (gene %in% exogenous_rna_names) %>%
na.omit ()
ggplot (
res_batch2_day2_lfc_df,
aes (x = log2 (baseMean), y = log2FoldChange, color = significant)
) +
geom_point () +
ggrepel:: geom_label_repel (
data = res_batch2_day2_lfc_labelled,
aes (label = gene, segment.colour = "black" ),
min.segment.length = 0
) +
theme_bw ()
Warning: Removed 6251 rows containing missing values (geom_point).
Plot of counts for a single gene (with lowest adjusted p-value)
Code
plot_list <- list ()
for (gene in res_batch2_day2_lfc_labelled$ gene) {
gene_prefix <- str_split (gene, "_" )[[1 ]][1 ]
d <- plotCounts (dds_batch2_day2,
gene = gene,
intgroup = "cell_line" ,
returnData = TRUE
)
p <- ggplot (
d %>%
rownames_to_column ("sample_unit" ) %>%
dplyr:: filter (grepl (gene_prefix, sample_unit)),
aes (x = cell_line, y = count)
) +
geom_point (position = position_jitter (width = 0.1 , height = 0 )) +
ggtitle (gene)
plot_list[[gene]] <- p
}
cowplot:: plot_grid (plotlist = plot_list)
Batch 3 - Day 1
Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental
Code
dds_batch3_day1 <- subset (dds, select = colData (dds)$ batch == "batch3" &
colData (dds)$ day == "day1" )
dds_batch3_day1$ exogenous_rna <- droplevels (dds_batch3_day1$ exogenous_rna)
dds_batch3_day1$ day <- droplevels (dds_batch3_day1$ day)
design (dds_batch3_day1) <- ~ exogenous_rna + cell_line
dds_batch3_day1 <- DESeq (dds_batch3_day1, parallel = TRUE )
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
class: DESeqDataSet
dim: 56701 12
metadata(1): version
assays(4): counts mu H cooks
rownames(56701): GAS5 SNORD30 ... MIR6134 ENSG00000270413
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(12): Parental_mastermix1_day1_rep1
Parental_mastermix1_day1_rep2 ... P1E10_sorted_mastermix2_day1_rep2
P1E10_sorted_mastermix2_day1_rep3
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch3_day1 <- results (dds_batch3_day1, alpha = 0.05 )
res_batch3_day1
log2 fold change (MLE): cell line P1E10 vs Parental
Wald test p-value: cell line P1E10 vs Parental
DataFrame with 56701 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue
<numeric> <numeric> <numeric> <numeric> <numeric>
GAS5 3961014 0.244497 0.0338285 7.227526 4.91872e-13
SNORD30 1821914 0.258996 0.0383083 6.760842 1.37192e-11
SNORD49A 1511998 0.249781 0.0372246 6.710113 1.94473e-11
SNORD26 978985 0.216188 0.0333114 6.489916 8.58842e-11
SNHG1 822961 0.013098 0.0370822 0.353214 7.23928e-01
... ... ... ... ... ...
ENSG00000275017 0 NA NA NA NA
ENSG00000214669 0 NA NA NA NA
ENSG00000276467 0 NA NA NA NA
MIR6134 0 NA NA NA NA
ENSG00000270413 0 NA NA NA NA
padj
<numeric>
GAS5 1.89349e-11
SNORD30 4.63230e-10
SNORD49A 6.49654e-10
SNORD26 2.67018e-09
SNHG1 9.29315e-01
... ...
ENSG00000275017 NA
ENSG00000214669 NA
ENSG00000276467 NA
MIR6134 NA
ENSG00000270413 NA
Code
out of 49746 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 859, 1.7%
LFC < 0 (down) : 1713, 3.4%
outliers [1] : 1, 0.002%
low counts [2] : 27764, 56%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Log fold change shrinkage
Run log fold change shrinkage to account for low counts.
Code
res_batch3_day1_lfc <- lfcShrink (dds_batch3_day1,
coef = "cell_line_P1E10_vs_Parental" ,
type = "apeglm" , parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
MA Plot (lfc shrunk)
MA Plot of shrunken log2 fold changes
Code
res_batch3_day1_lfc_df <- res_batch3_day1_lfc %>%
as.data.frame () %>%
rownames_to_column ("gene" ) %>%
dplyr:: mutate (significant = padj < 0.05 )
write_tsv (res_batch3_day1_lfc_df, file = "diffexp_results_batch3_day1.tsv" )
res_batch3_day1_lfc_labelled <- res_batch3_day1_lfc_df %>%
dplyr:: filter (gene %in% exogenous_rna_names) %>%
na.omit ()
ggplot (
res_batch3_day1_lfc_df,
aes (x = log2 (baseMean), y = log2FoldChange, colour = significant)
) +
geom_point () +
ggrepel:: geom_label_repel (
data = res_batch3_day1_lfc_labelled,
aes (label = gene, segment.colour = "black" ),
min.segment.length = 0
) +
theme_bw ()
Warning: Removed 6955 rows containing missing values (geom_point).
Counts of exogenous rna
Plot of counts for a single gene (with lowest adjusted p-value)
Code
plot_list <- list ()
for (gene in res_batch3_day1_lfc_labelled$ gene) {
exogenous_rna <- rna_mixes %>%
dplyr:: filter (rna_species == gene) %>%
pull (exogenous_rna)
exogenous_rna_regex <- paste (exogenous_rna, collapse = "|" )
d <- plotCounts (dds_batch3_day1,
gene = gene,
intgroup = "cell_line" ,
returnData = TRUE
)
p <- ggplot (
d %>%
rownames_to_column ("sample_unit" ) %>%
dplyr:: filter (grepl (exogenous_rna_regex, sample_unit)),
aes (x = cell_line, y = count)
) +
geom_point (position = position_jitter (width = 0.1 , height = 0 )) +
ggtitle (gene)
plot_list[[gene]] <- p
}
cowplot:: plot_grid (plotlist = plot_list, ncol = 2 )
Batch 3 - Day 2
Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental
Code
dds_batch3_day2 <- subset (dds, select = colData (dds)$ batch == "batch3" &
colData (dds)$ day == "day2" )
dds_batch3_day2$ exogenous_rna <- droplevels (dds_batch3_day2$ exogenous_rna)
dds_batch3_day2$ day <- droplevels (dds_batch3_day2$ day)
design (dds_batch3_day2) <- ~ exogenous_rna + cell_line
dds_batch3_day2 <- DESeq (dds_batch3_day2, parallel = TRUE )
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
class: DESeqDataSet
dim: 56701 12
metadata(1): version
assays(4): counts mu H cooks
rownames(56701): GAS5 SNORD30 ... MIR6134 ENSG00000270413
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(12): Parental_mastermix1_day2_rep1
Parental_mastermix1_day2_rep2 ... P1E10_sorted_mastermix2_day2_rep2
P1E10_sorted_mastermix2_day2_rep3
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch3_day2 <- results (dds_batch3_day2, alpha = 0.05 )
res_batch3_day2
log2 fold change (MLE): cell line P1E10 vs Parental
Wald test p-value: cell line P1E10 vs Parental
DataFrame with 56701 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue
<numeric> <numeric> <numeric> <numeric> <numeric>
GAS5 4358469 0.3663917 0.0342507 10.697334 1.04748e-26
SNORD30 2397520 0.3288890 0.0409048 8.040348 8.95837e-16
SNORD49A 1753234 0.4149990 0.0478249 8.677475 4.04651e-18
SNORD26 1116912 0.2291981 0.0406001 5.645253 1.64939e-08
SNHG1 818057 0.0377487 0.0417886 0.903324 3.66354e-01
... ... ... ... ... ...
ENSG00000275017 0 NA NA NA NA
ENSG00000214669 0 NA NA NA NA
ENSG00000276467 0 NA NA NA NA
MIR6134 0 NA NA NA NA
ENSG00000270413 0 NA NA NA NA
padj
<numeric>
GAS5 9.49256e-25
SNORD30 4.16929e-14
SNORD49A 2.25468e-16
SNORD26 4.16947e-07
SNHG1 7.16437e-01
... ...
ENSG00000275017 NA
ENSG00000214669 NA
ENSG00000276467 NA
MIR6134 NA
ENSG00000270413 NA
Code
out of 50021 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 1163, 2.3%
LFC < 0 (down) : 2110, 4.2%
outliers [1] : 6, 0.012%
low counts [2] : 26000, 52%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Log fold change shrinkage
Run log fold change shrinkage to account for low counts.
Code
res_batch3_day2_lfc <- lfcShrink (dds_batch3_day2,
coef = "cell_line_P1E10_vs_Parental" ,
type = "apeglm" , parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
MA Plot (lfc shrunk)
MA Plot of shrunken log2 fold changes
Code
res_batch3_day2_lfc_df <- res_batch3_day2_lfc %>%
as.data.frame () %>%
rownames_to_column ("gene" ) %>%
dplyr:: mutate (significant = padj < 0.05 )
write_tsv (res_batch3_day2_lfc_df, file = "diffexp_results_batch3_day2.tsv" )
res_batch3_day2_lfc_labelled <- res_batch3_day2_lfc_df %>%
dplyr:: filter (gene %in% exogenous_rna_names) %>%
na.omit ()
ggplot (
res_batch3_day2_lfc_df,
aes (x = log2 (baseMean), y = log2FoldChange, colour = significant)
) +
geom_point () +
ggrepel:: geom_label_repel (
data = res_batch3_day2_lfc_labelled,
aes (label = gene, segment.colour = "black" ),
min.segment.length = 0
) +
theme_bw ()
Warning: Removed 6680 rows containing missing values (geom_point).
Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Counts of exogenous rna
Plot of counts for a single gene (with lowest adjusted p-value)
Code
plot_list <- list ()
for (gene in res_batch3_day2_lfc_labelled$ gene) {
exogenous_rna <- rna_mixes %>%
dplyr:: filter (rna_species == gene) %>%
pull (exogenous_rna)
exogenous_rna_regex <- paste (exogenous_rna, collapse = "|" )
d <- plotCounts (dds_batch3_day2,
gene = gene,
intgroup = "cell_line" ,
returnData = TRUE
)
p <- ggplot (
d %>%
rownames_to_column ("sample_unit" ) %>%
dplyr:: filter (grepl (exogenous_rna_regex, sample_unit)),
aes (x = cell_line, y = count)
) +
geom_point (position = position_jitter (width = 0.1 , height = 0 )) +
ggtitle (gene)
plot_list[[gene]] <- p
}
cowplot:: plot_grid (plotlist = plot_list, ncol = 2 )